import pandas as pd
import numpy as np
import sklearn.model_selection
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, recall_score, precision_score, accuracy_score

import networkx as nx

# Load the raw transactions. The first CSV column is dropped by position
# (presumably a saved row index -- confirm against the file).
fraudTrain = pd.read_csv("~/Desktop/fraudTrain.csv").iloc[:, 1:]
# Parse the timestamp column with one vectorized call instead of calling
# pd.to_datetime once per row, which is very slow on ~1M rows.
fraudTrain = fraudTrain.assign(
    trans_date_trans_time=pd.to_datetime(fraudTrain.trans_date_trans_time)
)
fraudTrain

	trans_date_trans_time	cc_num	merchant	category	amt	first	last	gender	street	city	...	lat	long	city_pop	job	dob	trans_num	unix_time	merch_lat	merch_long	is_fraud
0	2019-01-01 00:00:00	2.703190e+15	fraud_Rippin, Kub and Mann	misc_net	4.97	Jennifer	Banks	F	561 Perry Cove	Moravian Falls	...	36.0788	-81.1781	3495	Psychologist, counselling	1988-03-09	0b242abb623afc578575680df30655b9	1325376018	36.011293	-82.048315	0
1	2019-01-01 00:00:00	6.304230e+11	fraud_Heller, Gutmann and Zieme	grocery_pos	107.23	Stephanie	Gill	F	43039 Riley Greens Suite 393	Orient	...	48.8878	-118.2105	149	Special educational needs teacher	1978-06-21	1f76529f8574734946361c461b024d99	1325376044	49.159047	-118.186462	0
2	2019-01-01 00:00:00	3.885950e+13	fraud_Lind-Buckridge	entertainment	220.11	Edward	Sanchez	M	594 White Dale Suite 530	Malad City	...	42.1808	-112.2620	4154	Nature conservation officer	1962-01-19	a1a22d70485983eac12b5b88dad1cf95	1325376051	43.150704	-112.154481	0
3	2019-01-01 00:01:00	3.534090e+15	fraud_Kutch, Hermiston and Farrell	gas_transport	45.00	Jeremy	White	M	9443 Cynthia Court Apt. 038	Boulder	...	46.2306	-112.1138	1939	Patent attorney	1967-01-12	6b849c168bdad6f867558c3793159a81	1325376076	47.034331	-112.561071	0
4	2019-01-01 00:03:00	3.755340e+14	fraud_Keeling-Crist	misc_pos	41.96	Tyler	Garcia	M	408 Bradley Rest	Doe Hill	...	38.4207	-79.4629	99	Dance movement psychotherapist	1986-03-28	a41d7549acf90789359a9aa5346dcb46	1325376186	38.674999	-78.632459	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1048570	2020-03-10 16:07:00	6.011980e+15	fraud_Fadel Inc	health_fitness	77.00	Haley	Wagner	F	05561 Farrell Crescent	Annapolis	...	39.0305	-76.5515	92106	Accountant, chartered certified	1943-05-28	45ecd198c65e81e597db22e8d2ef7361	1362931649	38.779464	-76.317042	0
1048571	2020-03-10 16:07:00	4.839040e+15	fraud_Cremin, Hamill and Reichel	misc_pos	116.94	Meredith	Campbell	F	043 Hanson Turnpike	Hedrick	...	41.1826	-92.3097	1583	Geochemist	1999-06-28	c00ce51c6ebb7657474a77b9e0b51f34	1362931670	41.400318	-92.726724	0
1048572	2020-03-10 16:08:00	5.718440e+11	fraud_O'Connell, Botsford and Hand	home	21.27	Susan	Mills	F	005 Cody Estates	Louisville	...	38.2507	-85.7476	736284	Engineering geologist	1952-04-02	17c9dc8b2a6449ca2473726346e58e6c	1362931711	37.293339	-84.798122	0
1048573	2020-03-10 16:08:00	4.646850e+18	fraud_Thompson-Gleason	health_fitness	9.52	Julia	Bell	F	576 House Crossroad	West Sayville	...	40.7320	-73.1000	4056	Film/video editor	1990-06-25	5ca650881b48a6a38754f841c23b77ab	1362931718	39.773077	-72.213209	0
1048574	2020-03-10 16:08:00	2.283740e+15	fraud_Buckridge PLC	misc_pos	6.81	Shannon	Williams	F	9345 Spencer Junctions Suite 183	Alpharetta	...	34.0770	-84.3033	165556	Prison officer	1997-12-27	8d0a575fe635bbde12f1a2bffc126731	1362931730	33.601468	-83.891921	0

1048575 rows × 22 columns

함수 만들기

ref: https://guebin.github.io/PP2023/posts/01_PythonBasic/2023-04-05-5wk-2.html

https://guebin.github.io/PP2023/posts/Appendix/2022-06-14-final.html#%EA%B0%80%EC%9C%84-%EB%B0%94%EC%9C%84-%EB%B3%B4-%ED%95%98%EB%82%98%EB%B9%BC%EA%B8%B0-150%EC%A0%90

class Metting0115:
    """Helper functions for the fraud-graph experiments, grouped in one class.

    Every helper except the ``generate_w`` placeholder is a ``staticmethod``,
    so it can be called either as ``Metting0115.throw(df, 0.5)`` or from an
    instance.
    """

    @staticmethod
    def throw(df, fraud_rate):
        """Discard normal rows until fraud makes up ``fraud_rate`` of the frame.

        Args:
            df: DataFrame with an ``is_fraud`` column holding 0/1.
            fraud_rate: Target share of fraud rows, in ``(0, 1]``.

        Returns:
            All fraud rows followed by the sampled normal rows (seed 42).

        Raises:
            ValueError: If ``fraud_rate`` is outside ``(0, 1]`` or cannot be
                reached by discarding normal rows only.
        """
        if not 0 < fraud_rate <= 1:
            raise ValueError(f"fraud_rate must be in (0, 1], got {fraud_rate!r}")
        fraud_rows = df[df['is_fraud'] == 1]
        normal_rows = df[df['is_fraud'] == 0]
        if normal_rows.empty:
            # Nothing to discard; also avoids dividing by zero below.
            return fraud_rows.copy()
        keep_frac = (len(fraud_rows) * (1 - fraud_rate)) / (len(normal_rows) * fraud_rate)
        if keep_frac > 1 + 1e-9:
            raise ValueError(
                "fraud_rate is lower than the current fraud share; it cannot "
                "be reached by discarding normal rows"
            )
        normal_kept = normal_rows.sample(frac=min(keep_frac, 1.0), random_state=42)
        return pd.concat([fraud_rows, normal_kept])

    @staticmethod
    def split_dataframe(data_frame, test_fraud_rate, test_rate=0.3, random_state=None):
        """Split into train/test parts with a chosen fraud share in the test part.

        Args:
            data_frame: DataFrame with an ``is_fraud`` column holding 0/1.
            test_fraud_rate: Share of fraud rows wanted in the test part.
            test_rate: Share of all rows that go to the test part.
            random_state: Optional seed forwarded to the sampling calls.

        Returns:
            ``(train_data, test_data)``; test rows are the sampled normal rows
            followed by the sampled fraud rows.

        Raises:
            ValueError: If a rate is outside ``[0, 1]`` or more rows of a class
                are requested than exist.
        """
        if not 0 <= test_fraud_rate <= 1:
            raise ValueError(f"test_fraud_rate must be in [0, 1], got {test_fraud_rate!r}")
        if not 0 <= test_rate <= 1:
            raise ValueError(f"test_rate must be in [0, 1], got {test_rate!r}")

        n = len(data_frame)
        # Separate fraud and normal transactions (as positional masks).
        is_fraud = (data_frame['is_fraud'] == 1).to_numpy()
        is_normal = (data_frame['is_fraud'] == 0).to_numpy()

        # Size of each class inside the test part.
        n_test_fraud = int(test_fraud_rate * (n * test_rate))
        n_test_normal = int(n * test_rate) - n_test_fraud
        if n_test_fraud > is_fraud.sum() or n_test_normal > is_normal.sum():
            raise ValueError("not enough fraud or normal rows for the requested test split")

        # Sample row positions rather than labels so duplicate index labels
        # cannot remove extra rows from the train part.
        positions = pd.Series(np.arange(n))
        fraud_pos = positions[is_fraud].sample(n=n_test_fraud, replace=False, random_state=random_state)
        normal_pos = positions[is_normal].sample(n=n_test_normal, replace=False, random_state=random_state)

        test_pos = np.concatenate([normal_pos.to_numpy(), fraud_pos.to_numpy()])
        in_test = np.zeros(n, dtype=bool)
        in_test[test_pos] = True

        test_data = data_frame.iloc[test_pos]
        train_data = data_frame.iloc[np.flatnonzero(~in_test)]
        return train_data, test_data

    @staticmethod
    def concat(df_tr, df_tst):
        """Stack train and test frames and return positional boolean masks.

        Returns:
            ``(df, (train_mask, test_mask))`` where the masks mark which rows
            of ``df`` came from ``df_tr`` and ``df_tst`` respectively.
        """
        df = pd.concat([df_tr, df_tst])
        # Positional masks; the original note suggests they are meant to
        # avoid relying on (possibly tangled) index labels.
        train_mask = np.concatenate((np.full(len(df_tr), True), np.full(len(df_tst), False)))
        test_mask = np.concatenate((np.full(len(df_tr), False), np.full(len(df_tst), True)))
        mask = (train_mask, test_mask)
        return df, mask

    @staticmethod
    def evaluation(y, yhat):
        """Return a one-row DataFrame of classification metrics rounded to 6 places."""
        metrics = [sklearn.metrics.accuracy_score,
                   sklearn.metrics.precision_score,
                   sklearn.metrics.recall_score,
                   sklearn.metrics.f1_score,
                   sklearn.metrics.roc_auc_score]
        # round(float(...)) works for both NumPy scalars and built-in floats;
        # the ``.round`` method only exists on NumPy scalars.
        return pd.DataFrame({m.__name__: [round(float(m(y, yhat)), 6)] for m in metrics})

    def generate_w(self):
        """Placeholder from the original outline; the weight matrix is not built yet."""
        raise NotImplementedError("generate_w is not implemented yet")

    @staticmethod
    def bipartite(df, node_1, node_2, graph_type=None):
        """Build a graph with one edge per distinct ``(node_1, node_2)`` pair.

        Edge attribute ``label`` is 1 if any transaction of the pair is fraud,
        and ``weight`` is the summed ``amt`` of the pair.

        Args:
            df: DataFrame with ``node_1``, ``node_2``, ``amt`` and ``is_fraud`` columns.
            node_1: Column whose values become the edge start nodes.
            node_2: Column whose values become the edge end nodes.
            graph_type: Passed to ``create_using``; a fresh ``nx.Graph()`` when None.
        """
        if graph_type is None:
            # A default instance in the signature would be shared (and cleared)
            # across calls, destroying previously returned graphs.
            graph_type = nx.Graph()
        values = df[node_1].values.tolist() + df[node_2].values.tolist()
        # dict.fromkeys keeps first-seen order, so node ids are reproducible.
        mapping = {value: node_id for node_id, value in enumerate(dict.fromkeys(values))}

        df = df.copy()
        df["from"] = df[node_1].apply(lambda x: mapping[x])  # edge start
        df["to"] = df[node_2].apply(lambda x: mapping[x])  # edge end

        edges = (
            df[["from", "to", "amt", "is_fraud"]]
            .groupby(["from", "to"])
            .agg({"is_fraud": "sum", "amt": "sum"})
            .reset_index()
        )
        edges["is_fraud"] = (edges["is_fraud"] > 0).astype(int)

        keys = list(zip(edges["from"].tolist(), edges["to"].tolist()))
        G = nx.from_edgelist(keys, create_using=graph_type)
        nx.set_edge_attributes(G, dict(zip(keys, edges["is_fraud"].tolist())), "label")
        nx.set_edge_attributes(G, dict(zip(keys, edges["amt"].tolist())), "weight")
        return G

    @staticmethod
    def tripartite(df, node_1, node_2, graph_type=None):
        """Build a graph linking each transaction node to its two endpoint nodes.

        Every row becomes a node (keyed by its index label) with one edge to
        its ``node_1`` node and one to its ``node_2`` node; both edges carry
        the row's ``is_fraud`` as ``label`` and ``amt`` as ``weight``.

        Args:
            df: DataFrame with ``node_1``, ``node_2``, ``amt`` and ``is_fraud`` columns.
            node_1: First endpoint column.
            node_2: Second endpoint column.
            graph_type: Passed to ``create_using``; a fresh ``nx.Graph()`` when None.
        """
        if graph_type is None:
            # See bipartite: never share one default graph instance across calls.
            graph_type = nx.Graph()
        row_labels = df.index.values.tolist()
        first_values = df[node_1].values.tolist()
        second_values = df[node_2].values.tolist()
        # dict.fromkeys keeps first-seen order, so node ids are reproducible.
        mapping = {
            value: node_id
            for node_id, value in enumerate(dict.fromkeys(row_labels + first_values + second_values))
        }

        txn_nodes = [mapping[label] for label in row_labels]
        in_edges = [(mapping[value], txn) for value, txn in zip(first_values, txn_nodes)]
        out_edges = [(mapping[value], txn) for value, txn in zip(second_values, txn_nodes)]
        all_edges = in_edges + out_edges
        labels = df["is_fraud"].tolist()
        amounts = df["amt"].tolist()

        G = nx.from_edgelist(all_edges, create_using=graph_type)
        nx.set_edge_attributes(G, dict(zip(all_edges, labels + labels)), "label")
        nx.set_edge_attributes(G, dict(zip(all_edges, amounts + amounts)), "weight")
        return G

throw

def throw(df, fraud_rate):
    """Discard normal rows until fraud makes up ``fraud_rate`` of the frame.

    Every row with ``is_fraud == 1`` is kept. Rows with ``is_fraud == 0`` are
    randomly sampled (fixed seed 42) so that the result has the requested
    fraud share.

    Args:
        df: DataFrame with an ``is_fraud`` column holding 0/1.
        fraud_rate: Target share of fraud rows, in ``(0, 1]``.

    Returns:
        DataFrame with all fraud rows followed by the sampled normal rows;
        original index labels are kept.

    Raises:
        ValueError: If ``fraud_rate`` is outside ``(0, 1]``, or is lower than
            the current fraud share (which cannot be reached by discarding
            normal rows only).
    """
    if not 0 < fraud_rate <= 1:
        raise ValueError(f"fraud_rate must be in (0, 1], got {fraud_rate!r}")

    fraud_rows = df[df['is_fraud'] == 1]
    normal_rows = df[df['is_fraud'] == 0]
    if normal_rows.empty:
        # Nothing to discard; also avoids dividing by zero below.
        return fraud_rows.copy()

    keep_frac = (len(fraud_rows) * (1 - fraud_rate)) / (len(normal_rows) * fraud_rate)
    if keep_frac > 1 + 1e-9:
        raise ValueError(
            "fraud_rate is lower than the current fraud share; it cannot be "
            "reached by discarding normal rows"
        )
    # Clamp tiny floating-point overshoot so an exact match keeps every row.
    normal_kept = normal_rows.sample(frac=min(keep_frac, 1.0), random_state=42)
    return pd.concat([fraud_rows, normal_kept])

# Keep every fraud row and downsample normal rows so that fraud is 50% of ``df``.
df = throw(fraudTrain, 0.5)

df

	trans_date_trans_time	cc_num	merchant	category	amt	first	last	gender	street	city	...	lat	long	city_pop	job	dob	trans_num	unix_time	merch_lat	merch_long	is_fraud
2449	2019-01-02 01:06:00	4.613310e+12	fraud_Rutherford-Mertz	grocery_pos	281.06	Jason	Murphy	M	542 Steve Curve Suite 011	Collettsville	...	35.9946	-81.7266	885	Soil scientist	1988-09-15	e8a81877ae9a0a7f883e15cb39dc4022	1325466397	36.430124	-81.179483	1
2472	2019-01-02 01:47:00	3.401870e+14	fraud_Jenkins, Hauck and Friesen	gas_transport	11.52	Misty	Hart	F	27954 Hall Mill Suite 575	San Antonio	...	29.4400	-98.4590	1595797	Horticultural consultant	1960-10-28	bc7d41c41103877b03232f03f1f8d3f5	1325468849	29.819364	-99.142791	1
2523	2019-01-02 03:05:00	3.401870e+14	fraud_Goodwin-Nitzsche	grocery_pos	276.31	Misty	Hart	F	27954 Hall Mill Suite 575	San Antonio	...	29.4400	-98.4590	1595797	Horticultural consultant	1960-10-28	b98f12f4168391b2203238813df5aa8c	1325473523	29.273085	-98.836360	1
2546	2019-01-02 03:38:00	4.613310e+12	fraud_Erdman-Kertzmann	gas_transport	7.03	Jason	Murphy	M	542 Steve Curve Suite 011	Collettsville	...	35.9946	-81.7266	885	Soil scientist	1988-09-15	397894a5c4c02e3c61c784001f0f14e4	1325475483	35.909292	-82.091010	1
2553	2019-01-02 03:55:00	3.401870e+14	fraud_Koepp-Parker	grocery_pos	275.73	Misty	Hart	F	27954 Hall Mill Suite 575	San Antonio	...	29.4400	-98.4590	1595797	Horticultural consultant	1960-10-28	7863235a750d73a244c07f1fb7f0185a	1325476547	29.786426	-98.683410	1
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
490138	2019-08-02 14:26:00	2.242540e+15	fraud_Kerluke-Abshire	shopping_net	226.12	Samuel	Jenkins	M	43235 Mckenzie Views Apt. 837	Westport	...	38.4921	-85.4524	564	Pensions consultant	1996-04-10	7f7585873fbe12b0aab9dc95ba3cecab	1343917615	37.706700	-85.806080	0
658275	2019-10-07 07:50:00	2.131640e+14	fraud_Erdman-Kertzmann	gas_transport	71.13	Mark	Tyler	M	82201 Bradley Radial Suite 703	Avera	...	33.1410	-82.5150	741	Claims inspector/assessor	1986-04-28	305f6d8297b81a36f7e57e10c1036451	1349596244	33.359566	-82.730195	0
767052	2019-11-24 15:38:00	4.464460e+12	fraud_Ritchie, Oberbrunner and Cremin	travel	2.00	Breanna	Rodriguez	F	118 Cabrera Springs Apt. 105	Lanark Village	...	29.8826	-84.5964	217	Television production assistant	1990-01-24	3fcaef8c9a2e3654b51eb0b0b84ff424	1353771522	29.239729	-84.247963	0
234186	2019-04-28 06:15:00	5.020130e+11	fraud_Kassulke PLC	shopping_net	202.12	Sherry	Martinez	F	144 Yu Locks Apt. 754	Garrattsville	...	42.6315	-75.1866	165	Naval architect	1945-09-20	dc14c572855f13df2e55e6e844b2dd89	1335593704	43.461382	-75.910293	0
340364	2019-06-09 21:16:00	2.297450e+15	fraud_Medhurst Inc	home	16.46	Laura	Walker	F	611 Michael Rue	Cisco	...	39.9972	-88.6962	478	Landscape architect	1960-01-13	e53dd35b00303fb8bb022f2f015b55ec	1339276596	39.827886	-88.134687	0

12012 rows × 22 columns

split_dataframe

def split_dataframe(data_frame, test_fraud_rate, test_rate=0.3, random_state=None):
    """Split into train/test parts with a chosen fraud share in the test part.

    Args:
        data_frame: DataFrame with an ``is_fraud`` column holding 0/1.
        test_fraud_rate: Share of fraud rows wanted in the test part.
        test_rate: Share of all rows that go to the test part.
        random_state: Optional seed forwarded to both sampling calls so the
            split can be reproduced. ``None`` keeps the split random.

    Returns:
        ``(train_data, test_data)``. Test rows are the sampled normal rows
        followed by the sampled fraud rows; train rows keep their original
        order.

    Raises:
        ValueError: If a rate is outside ``[0, 1]`` or more rows of a class
            are requested than exist.
    """
    if not 0 <= test_fraud_rate <= 1:
        raise ValueError(f"test_fraud_rate must be in [0, 1], got {test_fraud_rate!r}")
    if not 0 <= test_rate <= 1:
        raise ValueError(f"test_rate must be in [0, 1], got {test_rate!r}")

    n = len(data_frame)

    # Separate fraud and normal transactions (as positional masks).
    is_fraud = (data_frame['is_fraud'] == 1).to_numpy()
    is_normal = (data_frame['is_fraud'] == 0).to_numpy()

    # Size of each class inside the test part.
    n_test_fraud = int(test_fraud_rate * (n * test_rate))
    n_test_normal = int(n * test_rate) - n_test_fraud
    if n_test_fraud > is_fraud.sum() or n_test_normal > is_normal.sum():
        raise ValueError("not enough fraud or normal rows for the requested test split")

    # Sample row positions rather than index labels: selecting the train part
    # by label would drop extra rows whenever labels are duplicated.
    positions = pd.Series(np.arange(n))
    fraud_pos = positions[is_fraud].sample(n=n_test_fraud, replace=False, random_state=random_state)
    normal_pos = positions[is_normal].sample(n=n_test_normal, replace=False, random_state=random_state)

    # Test part: normal rows first, then fraud rows.
    test_pos = np.concatenate([normal_pos.to_numpy(), fraud_pos.to_numpy()])
    in_test = np.zeros(n, dtype=bool)
    in_test[test_pos] = True

    test_data = data_frame.iloc[test_pos]
    # Train part: every row that was not drawn for the test part.
    train_data = data_frame.iloc[np.flatnonzero(~in_test)]

    return train_data, test_data

# Hold out the default 30% of rows as the test part, with a 30% fraud share in it.
df_tr, df_ts = split_dataframe(df, 0.3)

# Fraud share of the train part and of the test part.
df_tr.is_fraud.mean(), df_ts.is_fraud.mean()

(0.5856820073730526, 0.3000277546489037)

len(df_tr)/len(df), len(df_ts)/len(df)

(0.7000499500499501, 0.2999500499500499)

concat

df_tr.shape, df_ts.shape

((8409, 22), (3603, 22))

def concat(df_tr, df_tst):
    """Stack the train and test frames and report which rows came from where.

    Args:
        df_tr: Training DataFrame; its rows come first in the result.
        df_tst: Test DataFrame; its rows follow the training rows.

    Returns:
        ``(df, (train_mask, test_mask))`` where ``df`` is the stacked frame and
        the two boolean arrays mark, by position, the rows taken from
        ``df_tr`` and ``df_tst``.
    """
    n_train = len(df_tr)
    n_total = n_train + len(df_tst)
    stacked = pd.concat([df_tr, df_tst])
    # Positional masks; the original note suggests they are meant to avoid
    # relying on (possibly tangled) index labels.
    is_train = np.arange(n_total) < n_train
    is_test = ~is_train
    return stacked, (is_train, is_test)

- 예시

# Re-stack the train and test rows; ``df`` is rebound to the stacked frame and
# ``mask`` holds the (train, test) boolean masks.
df, mask = concat(df_tr, df_ts)

mask

(array([ True,  True,  True, ..., False, False, False]),
 array([False, False, False, ...,  True,  True,  True]))

mask[1].sum()

evaluation

def evaluation(y, yhat):
    """Score predictions with five scikit-learn classification metrics.

    Args:
        y: True labels.
        yhat: Predicted labels; the same values are also passed to
            ``roc_auc_score``.

    Returns:
        One-row DataFrame with a column per metric (named after the metric
        function), each value rounded to 6 decimal places.
    """
    metrics = [sklearn.metrics.accuracy_score,
               sklearn.metrics.precision_score,
               sklearn.metrics.recall_score,
               sklearn.metrics.f1_score,
               sklearn.metrics.roc_auc_score]
    # round(float(...)) works for both NumPy scalars and built-in floats; the
    # ``.round`` method only exists on NumPy scalars, so it breaks when a
    # metric returns a plain float.
    return pd.DataFrame({m.__name__: [round(float(m(y, yhat)), 6)] for m in metrics})

- 예시

# Toy true labels and hard 0/1 predictions for a quick check of evaluation().
y = [1, 0, 1, 1, 0, 1, 0, 0]
yhat = [1, 0, 1, 0, 1, 1, 0, 1]

evaluation(y,yhat)

	accuracy_score	precision_score	recall_score	f1_score	roc_auc_score
0	0.625	0.6	0.75	0.666667	0.625

generate_w

def generate_w(df, r, time):
    """Build the dense time-decay weight matrix ``W`` for the rows of ``df``.

    ``W[i, j] = exp(-|t_i - t_j| / r)`` where ``t`` are the timestamps of
    column ``time`` expressed in nanoseconds. Entries whose weight is exactly
    1 (identical timestamps, which includes the diagonal) are set to 0.

    NOTE(review): this follows the input/output spec written in the notes
    next to this function; confirm it matches the intended definition of W.
    The result needs O(len(df)**2) memory, so it is only practical for small
    frames.

    Args:
        df: DataFrame holding the transactions.
        r: Positive decay scale (gamma), in nanoseconds.
        time: Name of the timestamp column in ``df``.

    Returns:
        ``numpy.ndarray`` of shape ``(len(df), len(df))``.

    Raises:
        ValueError: If ``r`` is not positive.
    """
    if r <= 0:
        raise ValueError(f"r must be positive, got {r!r}")
    stamps = pd.to_datetime(df[time]).to_numpy(dtype="datetime64[ns]").astype(np.int64)
    gaps = np.abs(stamps[:, None] - stamps[None, :])
    W = np.exp(-gaps / r)
    # A weight of exactly 1 means "no time gap"; such pairs get no weight.
    W[W == 1] = 0.0
    return W

아래와 같은 함수를 만들것

이름: 가중치생성

입력: df, gamma, df에서 time을 나타내는 col

출력: W (numpy matrix)

이런식의 행렬 만드려 했는데.. 메모리 부족으로 matrix 행렬을 생성할 수 없음 (밑에 예시는 교수님 toy example)

- 요거 세 개 합쳐야 W 나올듯

def compute_time_difference(df, unique_col=None, time_col='trans_date_trans_time'):
    """List the absolute time gap between every ordered pair of rows.

    With ``unique_col=None`` all rows of ``df`` are treated as one group, which
    is how the function is called per group elsewhere in this file. With a
    column name, rows are grouped by that column and pairs are only formed
    inside each group.

    Args:
        df: DataFrame (or a single group of one) with a timestamp column.
        unique_col: Optional column to group by before pairing rows.
        time_col: Name of the timestamp column.

    Returns:
        List of ``[index_label_i, index_label_j, gap_in_nanoseconds]`` for all
        ordered pairs ``(i, j)``, including ``i == j``.
    """
    if unique_col is not None:
        pairs = []
        for _, group in df.groupby(unique_col):
            pairs.extend(compute_time_difference(group, time_col=time_col))
        return pairs

    labels = df.index.tolist()
    # Nanosecond integers, matching ``Timestamp.value``.
    stamps = pd.to_datetime(df[time_col]).to_numpy(dtype="datetime64[ns]").astype(np.int64)
    # One vectorized all-pairs subtraction instead of n*n ``iloc`` lookups.
    gaps = np.abs(stamps[:, None] - stamps[None, :]).tolist()
    return [
        [label_i, label_j, gap]
        for label_i, gap_row in zip(labels, gaps)
        for label_j, gap in zip(labels, gap_row)
    ]



def edge_index(df, unique_col):
    """Collect the per-group time-difference rows into one array and save it.

    Rows of ``df`` are grouped by ``unique_col``; ``compute_time_difference``
    is called once per group and all returned rows are stacked.

    Args:
        df: DataFrame to group.
        unique_col: Column to group by; also used to build the file name.

    Returns:
        ``numpy.ndarray`` built from the stacked rows. The same array is
        written to ``edge_index_list_plus_<unique_col>.npy`` (spaces and
        underscores removed from the column name) in the working directory.
    """
    stacked_rows = []
    for _, group in df.groupby(unique_col):
        stacked_rows.extend(compute_time_difference(group))
    edges = np.array(stacked_rows)

    # Save the result under a name derived from the grouping column.
    tag = str(unique_col).replace(' ', '').replace('_', '')
    np.save(f"edge_index_list_plus_{tag}.npy", edges)
    return edges


def edge_select(edge_index, gamma):
    """Turn the time gaps in column 2 into decay weights ``exp(-gap / gamma)``.

    Weights that evaluate to exactly 1 (a gap of zero, e.g. a row paired with
    itself) are set to 0. The input is not modified: the work is done on a
    float64 copy, so integer input no longer truncates every weight to 0.

    Args:
        edge_index: Array-like of rows ``[i, j, gap]``.
        gamma: Positive decay scale, in the same unit as ``gap``.

    Returns:
        List of ``[i, j, weight]`` rows (all floats).

    Raises:
        ValueError: If ``gamma`` is not positive.
    """
    if gamma <= 0:
        raise ValueError(f"gamma must be positive, got {gamma!r}")
    edges = np.array(edge_index, dtype=np.float64)
    if edges.size == 0:
        return []
    weights = np.exp(-edges[:, 2] / gamma)
    edges[:, 2] = np.where(weights != 1, weights, 0.0)
    return edges.tolist()

- 의문..

현재 시간의 차이를 계산해서 gamma로 나누어 줌. 그래서 edge_index[:,2] 값이 클수록 시간이 가깝다는 뜻.. (exp(-edge/gamma) 하니까
그런데 edge_index의 shape은 (2, o) 형태여야 한단 말야 그럼 결국…… (i,j) 만 있어야 되니까 그중에 edge_select하는 방법을 생각했는데
그럼 time(weight)값은 어디로 간거?
여기서 어차피 할 수 있는건 edge_index를 고르는..작업을 하는건데 -> 여기서 고르는 방법을 퍼센테이지에 따라서 조정할 수 있을 거 같긴 함(지금은 걍 평균보다 큰거 고름)
GCNConv를 이용해서 나오는

위와 같은 걸 한번 봐보고 싶음

# Pairwise time gaps between rows that share a cc_num; the call also writes
# the array to an .npy file in the working directory.
a = edge_index(df, 'cc_num')

a.shape

(199868, 3)

a[:,2].shape

(199868,)

# Reload the saved array as float64 so column 2 can hold fractional weights.
edge_index2 = np.load('edge_index_list_plus_ccnum.npy').astype(np.float64)
edge_index2.shape

(199868, 3)

왜 다르징?

# NOTE(review): this rebinds the name ``edge_index`` from the function defined
# earlier to an array, so the function can no longer be called after this line.
edge_index = np.load('edge_index_list_plus50.npy').astype(np.float64)
edge_index.shape

(200706, 3)

200706-199868

edge_index2[:,2].mean()

7902291948085736.0

# Convert the time gaps in ``a`` to weights, using the mean gap of the
# reloaded array as gamma.
bb =np.array(edge_select(a,edge_index2[:,2].mean()))

edge_index2

array([[1.024900e+05, 1.024900e+05, 0.000000e+00],
       [1.024900e+05, 1.025560e+05, 4.200000e+12],
       [1.024900e+05, 1.041450e+05, 7.764000e+13],
       ...,
       [6.683020e+05, 3.797260e+05, 9.549240e+15],
       [6.683020e+05, 1.946330e+05, 1.596684e+16],
       [6.683020e+05, 6.683020e+05, 0.000000e+00]])

  theta = edge_index2[:,2].mean()

# Replace the gaps in column 2 with exp(-gap / theta); values that evaluate to
# exactly 1 are multiplied by False and therefore become 0.
edge_index2[:,2] = (np.exp(-edge_index2[:,2]/theta) != 1)*(np.exp(-edge_index2[:,2]/theta))

edge_index2.shape

(199868, 3)

edge_index2[:,2].mean()

0.5099377442499056

- select edge 할때 … 뭘 기준으로 select를 하는게 좋을지가 고민.

원래는 평균보다 큰 값을 기준으로 선택했는데, 그것 말고 다른 방법이 있을 것 같음

bipartite

def bipartite(df, node_1, node_2, graph_type=None):
    """Build a graph with one edge per distinct ``(node_1, node_2)`` pair.

    Values of the two columns are mapped to integer node ids in first-seen
    order. Transactions are aggregated per pair: edge attribute ``label`` is 1
    if any transaction of the pair is fraud (else 0) and ``weight`` is the
    summed ``amt`` of the pair.

    Args:
        df: DataFrame with ``node_1``, ``node_2``, ``amt`` and ``is_fraud`` columns.
        node_1: Column whose values become the edge start nodes.
        node_2: Column whose values become the edge end nodes.
        graph_type: Graph class or instance passed to ``create_using``. A fresh
            ``nx.Graph()`` is created when None.

    Returns:
        The populated networkx graph.
    """
    if graph_type is None:
        # A default instance in the signature would be created once and then
        # cleared and reused on every call, destroying earlier results.
        graph_type = nx.Graph()

    values = df[node_1].values.tolist() + df[node_2].values.tolist()
    # dict.fromkeys keeps first-seen order, so node ids are reproducible
    # (set iteration order is not, for strings).
    mapping = {value: node_id for node_id, value in enumerate(dict.fromkeys(values))}

    df = df.copy()
    df["from"] = df[node_1].apply(lambda x: mapping[x])  # edge start
    df["to"] = df[node_2].apply(lambda x: mapping[x])  # edge end

    edges = (
        df[["from", "to", "amt", "is_fraud"]]
        .groupby(["from", "to"])
        .agg({"is_fraud": "sum", "amt": "sum"})
        .reset_index()
    )
    edges["is_fraud"] = (edges["is_fraud"] > 0).astype(int)

    keys = list(zip(edges["from"].tolist(), edges["to"].tolist()))
    G = nx.from_edgelist(keys, create_using=graph_type)
    nx.set_edge_attributes(G, dict(zip(keys, edges["is_fraud"].tolist())), "label")
    nx.set_edge_attributes(G, dict(zip(keys, edges["amt"].tolist())), "weight")

    return G

# Card-to-merchant graph: one edge per (cc_num, merchant) pair.
G  = bipartite(df, node_1 = 'cc_num', node_2 = 'merchant')

tripartite


def tripartite(df, node_1, node_2, graph_type=None):
    """Build a graph linking each transaction node to its two endpoint nodes.

    Every row becomes a node keyed by its index label, with one edge to the
    node of its ``node_1`` value and one to the node of its ``node_2`` value.
    Both edges carry the row's ``is_fraud`` as ``label`` and ``amt`` as
    ``weight``. Index labels and column values share one id mapping, assigned
    in first-seen order.

    Args:
        df: DataFrame with ``node_1``, ``node_2``, ``amt`` and ``is_fraud`` columns.
        node_1: First endpoint column.
        node_2: Second endpoint column.
        graph_type: Graph class or instance passed to ``create_using``. A fresh
            ``nx.Graph()`` is created when None.

    Returns:
        The populated networkx graph.
    """
    if graph_type is None:
        # A default instance in the signature would be created once and then
        # cleared and reused on every call, destroying earlier results.
        graph_type = nx.Graph()

    row_labels = df.index.values.tolist()
    first_values = df[node_1].values.tolist()
    second_values = df[node_2].values.tolist()
    # dict.fromkeys keeps first-seen order, so node ids are reproducible
    # (set iteration order is not, for strings).
    mapping = {
        value: node_id
        for node_id, value in enumerate(dict.fromkeys(row_labels + first_values + second_values))
    }

    # Gather everything in one pass instead of six ``iterrows`` scans.
    txn_nodes = [mapping[label] for label in row_labels]
    in_edges = [(mapping[value], txn) for value, txn in zip(first_values, txn_nodes)]
    out_edges = [(mapping[value], txn) for value, txn in zip(second_values, txn_nodes)]
    all_edges = in_edges + out_edges
    labels = df["is_fraud"].tolist()
    amounts = df["amt"].tolist()

    G = nx.from_edgelist(all_edges, create_using=graph_type)
    nx.set_edge_attributes(G, dict(zip(all_edges, labels + labels)), "label")
    nx.set_edge_attributes(G, dict(zip(all_edges, amounts + amounts)), "weight")

    return G

# Rebind ``G`` to the transaction-centred graph (card -- transaction -- merchant).
G = tripartite(df, 'cc_num', 'merchant')

G_split



def G_split(G, test_size):
    """Split the edges of ``G`` into a train graph and a test graph.

    Edge positions are split with ``train_test_split`` (seed 42); the edge
    ``label`` values are passed alongside but the split label lists are not
    used. Each returned graph is a copy containing its share of the edges plus
    every node of ``G``, so both graphs have the same node set.

    Args:
        G: networkx graph whose edges carry a ``label`` attribute.
        test_size: Forwarded to ``train_test_split``.

    Returns:
        ``(train_graph, test_graph)``.
    """
    edge_list = list(G.edges)
    edge_labels = list(nx.get_edge_attributes(G, "label").values())
    train_idx, test_idx, _, _ = train_test_split(
        list(range(len(G.edges))),
        edge_labels,
        test_size=test_size,
        random_state=42,
    )

    def _with_all_nodes(positions):
        # Copy the chosen edges, then add back the nodes that lost all edges.
        part = G.edge_subgraph([edge_list[pos] for pos in positions]).copy()
        part.add_nodes_from(list(set(G.nodes) - set(part.nodes)))
        return part

    train_graph = _with_all_nodes(train_idx)
    test_graph = _with_all_nodes(test_idx)
    return train_graph, test_graph

# Put 20% of the edges in the test graph; both graphs keep every node of ``G``.
train_graph, test_graph = G_split(G, test_size=0.2)

G_

from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder

# Fit node2vec on the training graph only. G_split returns a
# (train_graph, test_graph) tuple, so passing its result directly would hand
# Node2Vec a tuple instead of a graph.
node2vec_train = Node2Vec(train_graph, weight_key='weight')
model_train = node2vec_train.fit(window=10)

Generating walks (CPU: 1): 100%|██████████| 10/10 [00:03<00:00,  2.53it/s]